﻿__author__ = 'wens'
#!/usr/bin/python
# -*- coding=utf-8 -*-
import urllib2
import re
import gzip
import StringIO
import csv
import xlrd
import xlwt


fname = "urllist.xls"
print "打开文档，写入数据"
bk = xlrd.open_workbook(fname)

shxrange = range(bk.nsheets)
sheetname = "Sheet1"

try:
    sh = bk.sheet_by_name(sheetname)
except:
    print "no sheet in %s named Sheet1" % fname
print "获取数据"

#获取行数
nrows = sh.nrows
#获取列数
ncols = sh.ncols
print "nrows %d, ncols %d" % (nrows,ncols)

# Index variable used by the loops further down.
i = 0

# Placeholders for the fields scraped from each announcement page; every one
# starts out as an empty string.
(
    dikuaibianma,  # parcel code
    dikuaiweizhi,  # parcel location
    tudiyongtu,    # land use
    area,          # land area (hectares)
    selldate,      # lease term
    sellPrice,     # transaction price
    unit,          # transferee (buyer)
    des,           # remarks
    ptime,         # publication time
) = ("",) * 9
#获取所有网址

urldic=[]
while i<nrows:
    rows = sh.row_values(i)
    urlstr=rows[0]
    urldic.append(urlstr)
    i+=1

#删除重复的
print "删除重复前:"+str(len(urldic))
uselist =  list(set(urldic))
print "删除重复后:"+str(len(uselist))
nrows = len(uselist)
allDic=[]


i=0
while i<nrows :
    urlstr=uselist[i]

    print "**************************************************************"
    print str(i)+ u" 打开网页:"+urlstr
    i+=1

    response = urllib2.urlopen(urlstr)

    strl = response.read()
    response.close()
    pstrl = unicode(strl,"gb2312",'ignore').encode("utf8")
    strl = pstrl
    strl=strl.replace("\r\n","")
    strl=strl.replace(" ","")
    p=re.compile('\s+')
    strl=re.sub(p,'',strl)
    s="====================="
    lencount =2
    results = strl.split("地块编号")
    #==================================================================

    finddx = strl.find("项目名称")
    print "================================="
    print strl
    #获取发布时间
    sltimes = strl.split("发布时间：")
    ptime=""
    if len(sltimes)>1:
        nstr = sltimes[1]
        sltimes = nstr.split("<")
        ptime=sltimes[0]
    print "================================="

    if finddx<0:

        print len(results)
        ldj ="找到了"
        if(len(results) ==lencount):
            vki = 1
            while vki<len(results):
                clk = results[vki]
                vki+=1
                clk = clk.replace("</td>","")
                finds = clk.split("<td>")
                ind = 0
                for vk in finds:
                    print str(ind)+s
                    print vk
                    ind+=1

                nki=0
                lop=0
                while lop<len(finds):
                    ntr = finds[lop]
                    lop+=1
                    finddx = ntr.find("工业用地")
                    if finddx>=0:
                        nki=1
                        break

                if nki==1:
                    continue
                if finds[1]=="地块位置" and finds[2]=="土地面积(公顷)":
                    dikuaibianma=finds[7]
                    dikuaiweizhi=finds[8]
                    tudiyongtu=finds[10]
                    area=finds[9]
                    selldate=finds[11]
                    sellPrice=finds[12]
                    unit=finds[13]
                    des=""
                else:
                    ustr = finds[1].split("<")
                    dikuaibianma = ustr[0] #获取地块编码
                    #===========================
                    ustr=  finds[2].split("<") #获取地块位置
                    dikuaiweizhi = ustr[0]
                    #===========================
                    ustr=  finds[3].split("<") #获取土地用途
                    tudiyongtu = ustr[0]
                    #===========================
                    ustr=  finds[5].split("<") #获取土地面积
                    area = ustr[0]
                    #===========================
                    ustr=  finds[7].split("<") #获取出让年限
                    selldate = ustr[0]
                    #===========================
                    ustr=  finds[9].split("<") #获取成交价
                    sellPrice= ustr[0]
                    #===========================
                    ustr=  finds[10].split(">") #获取受让单位
                    unit = ustr[1].split("<")[0]
                    #===========================
                    ntr = finds[11]
                    fdx = ntr.find("土地使用条件")
                    des=""
                    if fdx>=0:
                        ntr = ntr.replace("</tr","")
                        ustr= ntr.split(">") #获取备注
                        des=ustr[1]
                        des="土地使用条件:"+des
                    else:
                        fdx = ntr.find("备注")
                        if fdx<0:
                            if len(finds)>12:
                                ntr=finds[12]
                                ustr= ntr.split("nbsp;") #获取备注
                                des=ustr[1].split("<")[0]
                            else:
                                des=""



                objDic=[]
                objDic.append(ptime)
                objDic.append(dikuaibianma)
                objDic.append(dikuaiweizhi)
                objDic.append(tudiyongtu)
                objDic.append(area)
                objDic.append(selldate)
                objDic.append(sellPrice)
                objDic.append(unit)
                objDic.append(des)

                allDic.append(objDic)
                print "==============================="
                print "  "
                print  "公示时间："+ptime
                print  "地块编码："+dikuaibianma
                print  "地块位置："+dikuaiweizhi
                print  "土地用途："+tudiyongtu
                print  "土地面积："+area
                print  "出让年限："+selldate
                print  "成交价："+sellPrice
                print  "受让单位："+unit
                print  "备注："+des

# Create the output workbook with a single sheet.
f= xlwt.Workbook()
sheetname = "成交公示"
sheet = f.add_sheet(unicode(sheetname,'utf-8'),cell_overwrite_ok=True)

# Header row, one title per output column.
headers = (
    u'公示时间',
    u'地块编码',
    u"地块位置",
    u"土地用途",
    u"土地面积(公顷)",
    u"出让年限",
    u"成交价",
    u"受让单位",
    u"备注",
)
for col, title in enumerate(headers):
    sheet.write(0, col, title)

#写入
'''
print "================="
i=1
while i<nrows:
    rows = sh.row_values(i)
    sheet.write(i,0,rows[0])
    sheet.write(i,1,rows[1])
    sheet.write(i,2,rows[2])
    sheet.write(i,3,rows[3])
    sheet.write(i,4,rows[4])
    sheet.write(i,5,rows[5])
    sheet.write(i,6,rows[6])
    sheet.write(i,7,rows[7])
    i+=1

oldDic=[]
i=1
while i<nrows:
    rdic = []
    rows = sh.row_values(i)
    rdic.append(rows[0])
    rdic.append(rows[1])
    rdic.append(rows[2])
    rdic.append(rows[3])
    rdic.append(rows[4])
    rdic.append(rows[5])
    rdic.append(rows[6])
    rdic.append(rows[7])
    oldDic.append(rdic)
    i+=1

ishave = 0
i=0

while i<len(oldDic):
    rdics = oldDic[i]
    i+=1
    bm = rdics[0]
    if bm == dikuaibianma:
        ishave = 1
        i=100000
        print "==============已经拥有了"
'''

nrows = len(allDic)
i=0
while i<nrows:
    wdic = allDic[i]
    k=i+1
    print "新的数据，开始写入档案"
    sheet.write(k,0,unicode(wdic[0],'utf-8'))
    sheet.write(k,1,unicode(wdic[1],'utf-8'))
    sheet.write(k,2,unicode(wdic[2],'utf-8'))
    sheet.write(k,3,unicode(wdic[3],'utf-8'))
    sheet.write(k,4,unicode(wdic[4],'utf-8'))
    sheet.write(k,5,unicode(wdic[5],'utf-8'))
    sheet.write(k,6,unicode(wdic[6],'utf-8'))
    sheet.write(k,7,unicode(wdic[7],'utf-8'))
    sheet.write(k,8,unicode(wdic[8],'utf-8'))
    i+=1

f.save( u'顺德土地信息.xls')





